By Chuanyun (Clara) Zang
This analysis addresses the Kaggle competition "Web Traffic Time Series Forecasting"; see https://www.kaggle.com/c/web-traffic-time-series-forecasting.
A Long Short-Term Memory (LSTM) network and some basic statistical methods are used in the analysis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.callbacks import ModelCheckpoint
# Load the training set: one row per page, one column per daily view count.
data = pd.read_csv('train_1.csv')
data.head()
data.info()
data.tail()
# Submission key: maps "<page>_<YYYY-MM-DD>" strings to prediction Ids.
key = pd.read_csv('key_1.csv')
key_split = key.copy()
# Split the combined Page string: the last 10 characters are the target date,
# the rest (minus the '_' separator) is the page name.
key_split['date'] = key_split.Page.apply(lambda a: a[-10:])
key_split['Page'] = key_split.Page.apply(lambda a: a[:-11])
key_split.head()
from keras import backend as K
def smape(y_true, y_pred):
    """Differentiable SMAPE loss (in percent) built from Keras backend ops.

    The denominator |y_true| + |y_pred| is clipped below by K.epsilon() so
    the ratio stays defined where both tensors are zero.
    """
    denom = K.clip(K.abs(y_true) + K.abs(y_pred), K.epsilon(), None)
    ratio = K.abs(y_true - y_pred) / denom
    return 200. * K.mean(ratio, axis=-1)
def smape_test(y_true, y_pred):
    """Symmetric MAPE (in percent) for evaluation on NumPy/pandas data.

    SMAPE = mean( |t - p| / ((t + p) / 200) ), with the convention that a
    position where t + p == 0 contributes 0 (matching the competition metric
    for all-zero pages).

    Parameters: y_true, y_pred -- array-likes of non-negative visit counts.
    Returns: scalar mean SMAPE.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = (y_true + y_pred) / 200.0
    # Fix: the original divided first and masked afterwards, which raised
    # divide-by-zero/invalid RuntimeWarnings and created transient inf/nan.
    # Substitute a dummy denominator where it is zero, then zero those terms.
    safe_denominator = np.where(denominator == 0, 1.0, denominator)
    diff = np.where(denominator == 0, 0.0, np.abs(y_true - y_pred) / safe_denominator)
    return np.mean(diff)
# --- Missing-value EDA on a copy of the raw training data ---
data0 = data.copy()
#data0.describe(include='all')
# Per-day count of missing entries across all pages (summary statistics).
data0.iloc[:,1:].isnull().sum(axis=0).describe()
#data0.isnull().sum(axis=1).describe()
# Number of pages with a complete (gap-free) history.
((data0.isnull().sum(axis=1) == 0)).sum()
# Distribution of how many days are missing per page.
plt.hist(data0.isnull().sum(axis=1))
plt.xlabel('missing days')
plt.ylabel('number of Pages')
plt.title('Distribution of Pages along Missing Values')
# Treat missing days as zero views (NOTE: conflates "no traffic" with "no data").
data0 = data0.fillna(0)
# X-axis for the per-page plots; assumes 550 daily columns -- TODO confirm against train_1.csv.
days = range(550)
def plot_entry(idx):
    """Plot the daily view series for row ``idx`` of the global ``data0``.

    Relies on the module-level ``data0`` (NaN-filled training data) and
    ``days`` (x-axis range); the page name becomes the plot title.
    """
    series = data0.iloc[idx, 1:]
    plt.figure(1, figsize=(10, 5))
    plt.plot(days, series)
    plt.xlabel('day')
    plt.ylabel('views')
    plt.title(data0.iloc[idx, 0])
    plt.show()
# Eyeball a few individual page histories.
for idx in [0, 5, 50, 10000, 50000]:
    print(idx)
    print(data0.iloc[idx,0])
    plot_entry(idx)
plot_entry(4)
# Fill missing values in the working copy before building model inputs.
data = data.fillna(0)
#X_train = data[data.columns[1:-120]]
# Ground-truth visit counts (untransformed) for the two 60-day target windows.
y_train_true = data[list(data.columns[-120:-60])]
#X_validate = data1[list(data1.columns[61:-60])]
y_validate_true = data[list(data.columns[-60:])]
#X_test = data1[list(data1.columns[121:])]
data1 = data.copy()
# Replace the textual 'Page' column with the integer row index.
data1['index1'] = data1.index
data1 = data1[['index1']+list(data1.columns)[1:-1]]
data1.head()
### Take log(x+1)
# Log-transform the counts to tame the heavy-tailed traffic distribution.
data1[data1.columns[1:]] = data1[data1.columns[1:]].apply(lambda x: np.log(x+1))
data1.head()
# Sliding 60-day-shifted windows (all on the log scale):
#   X_train    -> all but the last 120 days;  y_train    -> days -120..-60
#   X_validate -> shifted 60 days forward;    y_validate -> the last 60 days
#   X_test     -> the most recent window, used to forecast the next 60 days
X_train = data1[data1.columns[1:-120]]
y_train = data1[list(data1.columns[-120:-60])]
X_validate = data1[list(data1.columns[61:-60])]
y_validate = data1[list(data1.columns[-60:])]
X_test = data1[list(data1.columns[121:])]
print('X_train: ', X_train.shape)
print(X_train.head(1))
print('y_train: ', y_train.shape)
print(y_train.head(1))
print('X_validate: ', X_validate.shape)
print(X_validate.head(1))
print('y_validate: ', y_validate.shape)
print(y_validate.head(1))
print('X_test: ', X_test.shape)
print(X_test.head(1))
### Normalization X so that visits in each day are in the same range
## ? yes
#range_train = (X_train.max()-X_train.min()).replace('0', '1').astype('float64')
# Min-max scale each day (column) to [0, 1], per split.
# NOTE(review): a constant column gives max == min and produces NaNs here -- confirm.
X_train_norm = (X_train - X_train.min())/(X_train.max()-X_train.min())
#y_train_norm = (y_train - y_train.mean())/y_train.std()
#X_validate_norm = (X_validate.max()-X_validate.min()).replace('0', '1').astype('float64')
X_validate_norm = (X_validate - X_validate.min())/(X_validate.max()-X_validate.min())
X_test_norm = (X_test - X_test.min())/(X_test.max()-X_test.min())
print('X_train_norm: ', X_train_norm.shape, X_train_norm.head(1))
print('y_validate: ', y_validate.shape, y_validate.head(1))
## reshape after normalized
# Keras LSTMs expect 3-D input (samples, timesteps, features); here each page
# is a single timestep whose features are its normalized log view counts.
X_train1 = X_train_norm.values.reshape(X_train.shape[0], 1, X_train.shape[1])
X_validate1 = X_validate_norm.values.reshape(X_validate.shape[0], 1, X_validate.shape[1])
y_train1 = y_train.values
y_validate1 = y_validate.values
X_test1 = X_test_norm.values.reshape(X_test.shape[0], 1, X_test.shape[1])
print('X_train: ', X_train1.shape)
print(X_train1[0])
print('y_train: ', y_train1.shape)
print(y_train1[0])
print('X_validate: ', X_validate1.shape)
print(X_validate1[0])
print('y_validate: ', y_validate1.shape)
print(y_validate1[0])
print('X_test: ', X_test1.shape)
print(X_test1[0])
# LSTM(256) -> Dropout(0.3) -> Dense(60): predicts all 60 log-scale daily
# counts at once from one input window.
LSTM_model = Sequential()
LSTM_model.add(LSTM(256, input_shape = (1,X_train.shape[1])))
LSTM_model.add(Dropout(0.3))
LSTM_model.add(Dense(60))
# MAE on log counts; the custom SMAPE loss is left available as an alternative.
LSTM_model.compile(loss = 'mean_absolute_error', optimizer = 'rmsprop')
#LSTM_model.compile(loss = smape, optimizer = 'rmsprop')
LSTM_model.summary()
# Render the architecture diagram inline (requires graphviz/pydot).
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(LSTM_model).create(prog='dot', format='svg'))
epochs = 10
# Keep only the weights of the epoch with the best validation loss.
checkpointer = ModelCheckpoint(filepath='weights.best.from_scratch1.hdf5', verbose=1, save_best_only=True)
print('Start training...')
LSTM_model.fit(X_train1, y_train1, validation_data=(X_validate1, y_validate1),
               epochs=epochs, callbacks=[checkpointer], verbose=1)
#LSTM_model.fit(X_train1, y_train1, validation_split=0.05,
#               epochs=epochs, callbacks=[checkpointer], verbose=1)
# Restore the best checkpoint before evaluating.
LSTM_model.load_weights('weights.best.from_scratch1.hdf5')
########## Validate model
y_validate_pred = LSTM_model.predict(X_validate1)
#comment for LSTM
#y_validate_pred = y_validate_pred.reshape(y_validate_pred.shape[0], y_validate_pred.shape[2])
y_validate_pred1 = pd.DataFrame(data = y_validate_pred, columns = list(y_validate.columns))
##inverse to counts of visits
# Undo the log(x+1) transform, round to whole visits, clip negatives to zero.
y_val_pred = y_validate_pred1.apply(lambda x: np.exp(x)-1)
for i in list(y_val_pred.columns):
    y_val_pred[i] = y_val_pred[i].apply(lambda x: round(x))
y_val_pred[y_val_pred<0]=0
print('y_val_pred after inverse: ', y_val_pred.head())
print('y_validate true: ', y_validate_true.head())
print('SAMPE of validate set when training on X_train: ', smape_test(y_validate_true.stack(), y_val_pred.stack()))
##LSTM 46.39/ 45.5
## benchmark: per-page median of the previous 60 days (raw counts)
y_train_true1 = y_train.copy()
y_train_true1['V'] = y_train_true.median(axis=1)
y_train_true1.head()
# Broadcast the per-page median to every validation date as the naive prediction.
for i in list(y_validate_true.columns):
    y_train_true1[i] = y_train_true1['V']
print(y_train_true1[y_train_true1.columns[-60:]].head())
smape_test(y_validate_true.stack(), y_train_true1[y_train_true1.columns[-60:]].stack())
# Day indices for plotting: full history vs. the 60-day validation window.
days1 = range(550)
days2 = range(490,550)
def plot_pred(idx):
    """Overlay the true series for row ``idx`` with the median benchmark.

    Uses the module-level frames ``data``/``data0``, ``y_train_true1``
    (benchmark predictions) and the ``days1``/``days2`` plotting ranges.
    """
    plt.figure(1, figsize=(10, 5))
    plt.plot(days1, data.iloc[idx, 1:], label='true y_validate')
    plt.plot(days2, y_train_true1.iloc[idx, -60:], label='benchmark')
    plt.xlabel('day')
    plt.ylabel('views')
    plt.title(data0.iloc[idx, 0])
    plt.legend()
    plt.show()
# Visual check of the benchmark against a few pages.
for idx in [0, 5, 50, 10000, 50000]:
    plot_pred(idx)
plot_pred(10021)
## to get 'Page'
# Keep the page names (plus the last week of raw counts) to re-attach to predictions.
X_test_P = data[['Page']+list(data.columns[-7:])]
print(X_test_P.head())
######## Predict
LSTM_model.load_weights('weights.best.from_scratch1.hdf5')
y_test_pred = LSTM_model.predict(X_test1)
# One output column per future date, taken from the submission key.
out_cols = list(pd.unique(key_split['date'].values))
y_test_pred1 = pd.DataFrame(data = y_test_pred, columns = list(out_cols))
##Inverse if outlier removed
# Back-transform from log scale and clip negative predictions to zero.
y_test_pred2 = y_test_pred1.apply(lambda x: np.exp(x)-1)
y_test_pred2[y_test_pred2<0] =0
###merge into Page
# Row order matches `data`, so an index join attaches predictions to pages.
test_out = pd.merge(X_test_P, y_test_pred2, left_index = True, right_index = True)
test_out1 = test_out[['Page']+list(test_out.columns.values[-60:])]
print(test_out1.head())
# Long format: one (Page, date, prediction) row per cell. This model was
# trained on the window ending 120 days before the end of the data.
test_out_120D = pd.melt(test_out1, id_vars='Page', var_name='date', value_name='Visits_120D')
test_out_120D.head()
#results_120D = key_split.merge(test_out2, how ='left')
### Retrain on X_validate
# Second model: fine-tune on the most recent window (targets = last 60 days).
checkpointer = ModelCheckpoint(filepath='weights.best.from_scratch2.hdf5', verbose=1, save_best_only=True)
print('Start training...')
LSTM_model.fit(X_validate1, y_validate1, validation_split = 0.05,
               epochs=epochs, callbacks=[checkpointer], verbose=1)
LSTM_model.load_weights('weights.best.from_scratch2.hdf5')
### check model on training data
y_train_pred = LSTM_model.predict(X_train1)
y_train_pred1 = pd.DataFrame(data = y_train_pred, columns = list(y_train.columns))
# Back-transform, round and clip, as for the validation predictions.
y_train_pred = y_train_pred1.apply(lambda x: np.exp(x)-1)
for i in list(y_train_pred.columns):
    y_train_pred[i] = y_train_pred[i].apply(lambda x: round(x))
y_train_pred[y_train_pred<0]=0
print('SAMPE of train set when training on X_validate: ', smape_test(y_train_true.stack(),y_train_pred.stack()))
#45.52858730028269
######## Predict
LSTM_model.load_weights('weights.best.from_scratch2.hdf5')
y_test_pred = LSTM_model.predict(X_test1)
out_cols = list(pd.unique(key_split['date'].values))
y_test_pred1 = pd.DataFrame(data = y_test_pred, columns = list(out_cols))
##Inverse if outlier removed
y_test_pred2 = y_test_pred1.apply(lambda x: np.exp(x)-1)
y_test_pred2[y_test_pred2<0] =0
###merge into Page
test_out = pd.merge(X_test_P, y_test_pred2, left_index = True, right_index = True)
test_out1 = test_out[['Page']+list(test_out.columns.values[-60:])]
print(test_out1.head())
# Long format for the retrained ("60D") model's predictions.
test_out_60D = pd.melt(test_out1, id_vars='Page', var_name='date', value_name='Visits_60D')
test_out_60D.head()
#results_60D = key_split.merge(test_out2, how ='left')
#### Combine two model results
# Average the two LSTM variants per (Page, date) and round to whole visits.
results = test_out_120D.merge(test_out_60D)
results['Visits_Seq'] = results[['Visits_120D', 'Visits_60D']].mean(axis =1).apply(lambda x: round(x))
results.tail()
## Merge on 'Page' and 'date'
key_rnn_result = key_split.merge(results, how ='left')
key_rnn_result.head()
rnn_results = key_rnn_result[['Id','Visits_Seq']]
rnn_results.tail()
##### last week, daily
# Baseline 1: per-page median of the last 7 days.
train_bench = data[['Page'] + list(data.columns[-7:])]
train_bench['last_Visits'] = train_bench[list(train_bench.columns[-7:])].median(axis=1)
train_bench_last_week = train_bench[['Page', 'last_Visits']]
results1 = key_split.merge(train_bench_last_week, how = 'left')
results1['date'] = results1['date'].astype('datetime64[ns]')
# NOTE(review): despite the name, 'weekend' holds the day-of-week (0=Mon .. 6=Sun).
results1['weekend'] = (results1.date.dt.dayofweek).astype(float)
results1.tail()
##### Previous 3 weeks, DayofWeek
# Baseline 2: per-page, per-day-of-week median over the last 21 days.
train_bench_3week = data[['Page'] + list(data.columns[-21:])]
train_3week_flattened = pd.melt(train_bench_3week, id_vars='Page', var_name='date', value_name='Visits_3week')
train_3week_flattened['date'] = train_3week_flattened['date'].astype('datetime64[ns]')
train_3week_flattened['weekend'] = (train_3week_flattened.date.dt.dayofweek).astype(float)
train_3week_m = train_3week_flattened.groupby(['Page','weekend']).median().reset_index()
train_3week_m['L3week_Visits'] = train_3week_m.Visits_3week
print(train_3week_m.head())
train_bench_3week = train_3week_m[['Page', 'weekend', 'L3week_Visits']]
train_bench_3week.tail()
train_bench_3week['Page'].tail(1)
results2 = results1.merge(train_bench_3week, how = 'left')
results2.tail()
### previous 9 weeks which are more than 2 months
# Baseline 3: same as above over the last 63 days (9 weeks).
train_bench_9week = data[['Page'] + list(data.columns[-63:])]
train_9week_flattened = pd.melt(train_bench_9week, id_vars='Page', var_name='date', value_name='Visits_9week')
train_9week_flattened['date'] = train_9week_flattened['date'].astype('datetime64[ns]')
train_9week_flattened['weekend'] = (train_9week_flattened.date.dt.dayofweek).astype(float)
train_9week_m = train_9week_flattened.groupby(['Page','weekend']).median().reset_index()
train_9week_m['L9week_Visits'] = train_9week_m.Visits_9week
print(train_9week_m.head())
train_bench_9week = train_9week_m[['Page', 'weekend', 'L9week_Visits']]
train_bench_9week.tail()
results4 = results2.merge(train_bench_9week, how = 'left')
results4.tail()
#### last year, 2016
# Baseline 4: per-page, per-day-of-week median over the last 365 days.
train_bench_year = data[['Page'] + list(data.columns[-365:])]
train_year_flattened = pd.melt(train_bench_year, id_vars='Page', var_name='date', value_name='Visits_year')
train_year_flattened['date'] = train_year_flattened['date'].astype('datetime64[ns]')
train_year_flattened['weekend'] = (train_year_flattened.date.dt.dayofweek).astype(float)
train_year_m = train_year_flattened.groupby(['Page','weekend']).median().reset_index()
train_year_m['year_Visits'] = train_year_m.Visits_year
print(train_year_m.head())
train_bench_year = train_year_m[['Page', 'weekend', 'year_Visits']]
#print(train_bench_year.tail())
results6 = results4.merge(train_bench_year, how = 'left')
results6.tail()
### same month in previous years
# Baseline 5: per-page, per-day-of-week median over Jan 1 - Mar 1 of 2016,
# i.e. the same calendar window as the prediction period, one year earlier.
train_bench_month = data[['Page', '2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
                          '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
                          '2016-01-09', '2016-01-10', '2016-01-11', '2016-01-12',
                          '2016-01-13', '2016-01-14', '2016-01-15', '2016-01-16',
                          '2016-01-17', '2016-01-18', '2016-01-19', '2016-01-20',
                          '2016-01-21', '2016-01-22', '2016-01-23', '2016-01-24',
                          '2016-01-25', '2016-01-26', '2016-01-27', '2016-01-28',
                          '2016-01-29', '2016-01-30', '2016-01-31', '2016-02-01',
                          '2016-02-02', '2016-02-03', '2016-02-04', '2016-02-05',
                          '2016-02-06', '2016-02-07', '2016-02-08', '2016-02-09',
                          '2016-02-10', '2016-02-11', '2016-02-12', '2016-02-13',
                          '2016-02-14', '2016-02-15', '2016-02-16', '2016-02-17',
                          '2016-02-18', '2016-02-19', '2016-02-20', '2016-02-21',
                          '2016-02-22', '2016-02-23', '2016-02-24', '2016-02-25',
                          '2016-02-26', '2016-02-27', '2016-02-28', '2016-02-29',
                          '2016-03-01']]
train_month_flattened = pd.melt(train_bench_month, id_vars='Page', var_name='date', value_name='Visits_month')
train_month_flattened['date'] = train_month_flattened['date'].astype('datetime64[ns]')
# 'weekend' again holds the day-of-week (0=Mon .. 6=Sun), not a weekend flag.
train_month_flattened['weekend'] = (train_month_flattened.date.dt.dayofweek).astype(float)
train_month_m = train_month_flattened.groupby(['Page','weekend']).median().reset_index()
train_month_m['Month_Visits'] = train_month_m.Visits_month
print(train_month_m.head())
train_bench_month = train_month_m[['Page', 'weekend', 'Month_Visits']]
train_bench_month.head()
results7 = results6.merge(train_bench_month, how = 'left')
results7.tail()
#### run if you want to check EDA results
# Median-of-medians across the five seasonal baselines.
results7['Visits_M'] = results7[['last_Visits', 'L3week_Visits', 'L9week_Visits', 'year_Visits', 'Month_Visits']].median(axis =1).apply(lambda x: round(x))
print(rnn_results.columns)
print(results7.columns)
# Final prediction: median over the LSTM blend and all five baselines.
final_results = rnn_results.merge(results7, how = 'left')
final_results['Visits'] = final_results[['Visits_Seq', 'last_Visits', 'L3week_Visits', 'L9week_Visits', 'year_Visits', 'Month_Visits']].median(axis =1).apply(lambda x: round(x))
#final_results[['Id','Visits']].to_csv('results_RNN_EDA.csv', index=False)
final_results.head()
# True answers for the prediction period.  (sic: variable name typo 'anwer_key' kept.)
anwer_key = pd.read_csv("answer_key_1_0.csv")
anwer_key.head()
### benchmark by using the median of views in the last 60 days
bench = data[['Page'] + list(data.columns[-60:])]
bench['last_Visits'] = bench[list(bench.columns[-60:])].median(axis=1)
benchmark = bench[['Page', 'last_Visits']]
results0 = key_split.merge(benchmark, how = 'left')
# Score each candidate predictor against the answer key.
smape_test(anwer_key['Visits'], results0['last_Visits'])
#L9week_Visits
smape_test(anwer_key['Visits'], results7['L9week_Visits'])
smape_test(anwer_key['Visits'], rnn_results['Visits_Seq'])
### 45.97 when validate_split
### 45.89
smape_test(anwer_key['Visits'], results7['Visits_M'])
smape_test(anwer_key['Visits'], final_results['Visits'])
### The best!!
key_rnn_result.head()
smape_test(anwer_key['Visits'], key_rnn_result['Visits_60D'].apply(lambda x:round(x)))
smape_test(anwer_key['Visits'], key_rnn_result['Visits_120D'].apply(lambda x:round(x)))
pred_result = final_results[['Page', 'date', 'Visits']]
pred_result.head()
### data set including the true y_test
data2 = pd.read_csv('stage_2/train_2.csv')
data2.head(20)
data2 = data2.fillna(0)
benchmark.head()
# Duplicate the last-60-day median into 60 columns so it can be plotted per day.
for i in ['Visits' + str(i) for i in range(60)]:
    benchmark[i] = benchmark['last_Visits']
final_results.columns
results7.columns
# Day indices for the final comparison plots: last 50 train days + 60 test days.
days1 = range(500,610)
days2 = range(550,610)
def plot_pred1(idx):
    """Compare true test-period views with the benchmark, LSTM and combined models.

    Looks up row ``idx`` in the stage-2 data (``data2``) and matches its page
    name against the module-level prediction frames ``pred_result``,
    ``final_results`` and ``results7``; ``benchmark`` is aligned by row index.
    """
    plt.figure(1, figsize=(15, 8))
    page = data2.iloc[idx, 0]
    print(page)
    plt.plot(days1, data2.iloc[idx, 1 + 500:611], label='true y_test')
    plt.plot(days2, benchmark.iloc[idx, -60:], label='benchmark')
    plt.plot(days2, pred_result[pred_result.Page == page].Visits, label='final model')
    plt.plot(days2, final_results[final_results.Page == page].Visits_Seq, label='LSTM')
    plt.plot(days2, results7[results7.Page == page].Visits_M, label='Median of Medians')
    plt.xlabel('day')
    plt.ylabel('views')
    plt.legend()
    plt.show()
# Plot predictions vs. truth for a broad sample of pages.
for idx in [0, 5, 50, 5000, 10000, 50000, 1000, 2000, 3000, 4000,5500,6000,6500,7000,7500,8000,8500,9000,9500,10500,11000,11500,12000,12500,13000,13500,14000,14500,15000,15500,16000,16500,17000,17500,18000,18500,19000]:
    plot_pred1(idx)